Measurement, Latent Factors and Theory Construction

Confirmatory Factor Analysis in PyMC

cfa
sem
measurement
# Load R libraries: lavaan for CFA/SEM, lme4 for mixed models,
# modelsummary/tinytable/kableExtra for tables, ggplot2/egg/semPlot for
# figures, and reticulate to bridge to Python/PyMC.
library(lavaan)
library(dplyr)
library(reticulate)
library(marginaleffects)
library(modelsummary)
library(ggplot2)
library(egg)
library(lme4)
library(semPlot)
library(tinytable)
library(kableExtra)
library(reshape2)
# Import PyMC inside the embedded Python session used by later chunks.
reticulate::py_run_string("import pymc as pm")

# Machine-specific path to the conda envs providing PyMC -- adjust locally.
options(rstudio.python.installationPath = "/Users/nathanielforde/mambaforge/envs")
options("modelsummary_factory_default" = "tinytable")
options(repr.plot.width=15, repr.plot.height=8)

knitr::knit_engines$set(python = reticulate::eng_python)
options(scipen=999)  # suppress scientific notation in printed output
set.seed(130)        # reproducibility for the simulated noise added below
Warning

This is a work in progress. We intend to elaborate the choices related to analysing and understanding the data elicited through psychometric surveys. Psychometric phenomena are inherently complex and imprecisely measured. Special attention needs to be paid to the assumptions built into the methodologies used to analyse this species of data.

Measurement and Measurement Constructs

Science is easier when there is a recipe - some formula to follow or a procedure to adopt. You can out-source the responsibility for theory construction and methodological justification, masking implausible nonsense with the well-received mathematical gloss of “statistical significance”. Seen from 1000 feet, this is not surprising. Lip-service is paid to the idea of scientific method and we absolve ourselves of the requirement for principled justification and substantive argument. Evidence is marshalled in service to argument and it’s an absurd pretense to claim that data speaks for itself.

# Load the survey data and deliberately contaminate the third life
# satisfaction item for the western region with Gaussian noise, then
# derive simple sum/mean composite outcome scores.
df <- read.csv('sem_data.csv')
west_rows <- df$region == 'west'
region_noise <- rnorm(sum(west_rows), 0.5, 1) # generate the noise to add
df$ls_p3[west_rows] <- df$ls_p3[west_rows] + region_noise
ls_items <- c('ls_p1', 'ls_p2', 'ls_p3')
df$ls_sum <- rowSums(df[ls_items])
df$ls_mean <- rowMeans(df[ls_items])

kable(head(df))
ID region gender age se_acad_p1 se_acad_p2 se_acad_p3 se_social_p1 se_social_p2 se_social_p3 sup_friends_p1 sup_friends_p2 sup_friends_p3 sup_parents_p1 sup_parents_p2 sup_parents_p3 ls_p1 ls_p2 ls_p3 ls_sum ls_mean
1 west female 13 4.857143 5.571429 4.500000 5.80 5.500000 5.40 6.5 6.5 7.0 7.0 7.0 6.0 5.333333 6.75 7.647112 19.73044 6.576815
2 west male 14 4.571429 4.285714 4.666667 5.00 5.500000 4.80 4.5 4.5 5.5 5.0 6.0 4.5 4.333333 5.00 4.469623 13.80296 4.600985
10 west female 14 4.142857 6.142857 5.333333 5.20 4.666667 6.00 4.0 4.5 3.5 7.0 7.0 6.5 6.333333 5.50 4.710020 16.54335 5.514451
11 west female 14 5.000000 5.428571 4.833333 6.40 5.833333 6.40 7.0 7.0 7.0 7.0 7.0 7.0 4.333333 6.50 5.636198 16.46953 5.489844
12 west female 14 5.166667 5.600000 4.800000 5.25 5.400000 5.25 7.0 7.0 7.0 6.5 6.5 7.0 5.666667 6.00 5.266592 16.93326 5.644419
14 west male 14 4.857143 4.857143 4.166667 5.20 5.000000 4.20 5.5 6.5 7.0 6.5 6.5 6.5 5.000000 5.50 5.913709 16.41371 5.471236

Candidate Structure

#| code-fold: true
#| code-summary: "Show the code"

# Summary-statistics table with row bands coloured by variable group:
# rows 15:17 = life-satisfaction items, 18:19 = derived composites,
# 2:14 = the driver indicators.
# NOTE(review): the i = row indices are tied to the current column order
# of df -- re-check them if columns are added or reordered.
datasummary_skim(df)|> 
 style_tt(
   i = 15:17,
   j = 1:1,
   background = "#20AACC",
   color = "white",
   italic = TRUE) |> 
 style_tt(
   i = 18:19,
   j = 1:1,
   background = "#2888A0",
   color = "white",
   italic = TRUE) |> 
 style_tt(
   i = 2:14,
   j = 1:1,
   background = "#17C2AD",
   color = "white",
   italic = TRUE)
tinytable_ftyu5q15n5ibjk84ch05
Unique Missing Pct. Mean SD Min Median Max Histogram
ID 283 0 187.9 106.3 1.0 201.0 367.0
age 5 0 14.7 0.8 13.0 15.0 17.0
se_acad_p1 32 0 5.2 0.8 3.1 5.1 7.0
se_acad_p2 36 0 5.3 0.7 3.1 5.4 7.0
se_acad_p3 29 0 5.2 0.8 2.8 5.2 7.0
se_social_p1 24 0 5.3 0.8 1.8 5.4 7.0
se_social_p2 27 0 5.5 0.7 2.7 5.5 7.0
se_social_p3 31 0 5.4 0.8 3.0 5.5 7.0
sup_friends_p1 13 0 5.8 1.1 1.0 6.0 7.0
sup_friends_p2 10 0 6.0 0.9 2.5 6.0 7.0
sup_friends_p3 13 0 6.0 1.1 1.0 6.0 7.0
sup_parents_p1 11 0 6.0 1.1 1.0 6.0 7.0
sup_parents_p2 11 0 5.9 1.1 2.0 6.0 7.0
sup_parents_p3 13 0 5.7 1.1 1.0 6.0 7.0
ls_p1 15 0 5.2 0.9 2.0 5.3 7.0
ls_p2 21 0 5.8 0.7 2.5 5.8 7.0
ls_p3 161 0 5.5 1.1 1.7 5.6 9.6
ls_sum 218 0 16.5 2.2 8.2 16.7 21.0
ls_mean 217 0 5.5 0.7 2.7 5.6 7.0
N %
region east 142 50.2
west 141 49.8
gender female 132 46.6
male 151 53.4
# Names of the fifteen observed indicator variables: three items for each
# of the five constructs, in the fixed order used throughout the document.
drivers <- c(
  paste0('se_acad_p', 1:3),
  paste0('se_social_p', 1:3),
  paste0('sup_friends_p', 1:3),
  paste0('sup_parents_p', 1:3),
  paste0('ls_p', 1:3)
)



# Render a labelled heatmap of a square matrix-like object (e.g. the
# output of cov() or cor()).
#
# Args:
#   df:       matrix-like object of numeric values.
#   title:    plot title.
#   subtitle: plot subtitle.
# Returns: a ggplot object (tiles filled on a red/white/blue diverging
#   scale, cells labelled with values rounded to 2 decimals).
plot_heatmap <- function(df, title="Sample Covariances", subtitle="Observed Measures") {
  long_df <- melt(as.matrix(df))
  colnames(long_df) <- c("x", "y", "value")
  long_df$value <- round(long_df$value, 2)
  ggplot(long_df, aes(x = x, y = y, fill = value)) +
    geom_tile() +
    geom_text(aes(label = value), color = "black", size = 4) +
    scale_fill_gradient2(
      high = 'dodgerblue4',
      mid = 'white',
      low = 'firebrick2'
    ) +
    theme(axis.text.x = element_text(angle = 45)) +
    ggtitle(title, subtitle)
}

# Heatmaps of the sample covariance and correlation matrices of the
# fifteen observed indicators, stacked vertically.
g1 = plot_heatmap(cov(df[,  drivers]))

g2 = plot_heatmap(cor(df[,  drivers]), "Sample Correlations")

plot <- ggarrange(g1,g2, ncol=1, nrow=2);

Fit Initial Regression Models

# OLS regressions of the composite outcomes (ls_sum, ls_mean) on
# increasing subsets of the indicator items: first item per construct,
# first two items, then all three items.
formula_sum_1st = " ls_sum ~ se_acad_p1  + se_social_p1 +  sup_friends_p1  + sup_parents_p1"
formula_mean_1st = " ls_mean ~ se_acad_p1  + se_social_p1 +  sup_friends_p1  + sup_parents_p1"

formula_sum_12 = " ls_sum ~ se_acad_p1  + se_acad_p2 +  se_social_p1 + se_social_p2 + 
sup_friends_p1 + sup_friends_p2  + sup_parents_p1 + sup_parents_p2"
formula_mean_12 = " ls_mean ~ se_acad_p1  + se_acad_p2 +  se_social_p1 + se_social_p2 + 
sup_friends_p1 + sup_friends_p2  + sup_parents_p1 + sup_parents_p2"


formula_sum = " ls_sum ~ se_acad_p1 + se_acad_p2 + se_acad_p3 + se_social_p1 +  se_social_p2 + se_social_p3 +  sup_friends_p1 + sup_friends_p2 + sup_friends_p3 + sup_parents_p1 + sup_parents_p2 + sup_parents_p3"
formula_mean = " ls_mean ~ se_acad_p1 + se_acad_p2 + se_acad_p3 + se_social_p1 +  se_social_p2 + se_social_p3 +  sup_friends_p1 + sup_friends_p2 + sup_friends_p3 + sup_parents_p1 + sup_parents_p2 + sup_parents_p3"
# Fit all six models.
mod_sum = lm(formula_sum, df)
mod_sum_1st = lm(formula_sum_1st, df)
mod_sum_12 = lm(formula_sum_12, df)
mod_mean = lm(formula_mean, df)
mod_mean_1st = lm(formula_mean_1st, df)
mod_mean_12 = lm(formula_mean_12, df)

#' Rescale a numeric vector to the [0, 1] interval.
#'
#' @param x Numeric vector.
#' @param na.rm Drop NA values when computing the range? Defaults to
#'   FALSE for backward compatibility; with the default, a single NA
#'   makes the entire result NA because min()/max() propagate it.
#' @return Numeric vector the same length as x. A constant input yields
#'   NaN (0/0); callers should ensure the input has non-zero range.
min_max_norm <- function(x, na.rm = FALSE) {
    rng <- range(x, na.rm = na.rm)
    (x - rng[1]) / (rng[2] - rng[1])
}

# Min-max normalise the indicator columns so coefficients are comparable.
# NOTE(review): positional indexing df[c(5:19)] assumes the column layout
# of sem_data.csv plus the two derived columns -- confirm if the input
# file changes.
df_norm <- as.data.frame(lapply(df[c(5:19)], min_max_norm))

# Keep the outcomes on their original scale.
df_norm$ls_sum <- df$ls_sum
df_norm$ls_mean <- df$ls_mean

mod_sum_norm = lm(formula_sum, df_norm)
mod_mean_norm = lm(formula_mean, df_norm)

# Group the sum-score and mean-score regressions so modelsummary can show
# the two outcome families side by side (shape="cbind" puts the groups in
# adjacent column blocks).
models = list(
    "Outcome: sum_score" = list("model_sum_1st_factors" = mod_sum_1st,
     "model_sum_1st_2nd_factors" = mod_sum_12,
     "model_sum_score" = mod_sum,
     "model_sum_score_norm" = mod_sum_norm
     ),
    "Outcome: mean_score" = list(
      "model_mean_1st_factors" = mod_mean_1st,
     "model_mean_1st_2nd_factors" = mod_mean_12,
     "model_mean_score"= mod_mean, 
     "model_mean_score_norm" = mod_mean_norm
    )
    )
#| code-fold: true
#| code-summary: "Show the code"

# Highlight the coefficient rows (2:25) of the comparison table.
modelsummary(models, stars=TRUE, shape ="cbind") |> 
 style_tt(
   i = 2:25,
   j = 1:1,
   background = "#17C2AD",
   color = "white",
   italic = TRUE)
tinytable_jb9uvnwer8qg2neciem3
Outcome: sum_score Outcome: mean_score
model_sum_1st_factors model_sum_1st_2nd_factors model_sum_score model_sum_score_norm model_mean_1st_factors model_mean_1st_2nd_factors model_mean_score model_mean_score_norm
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 6.335*** 4.038*** 3.709*** 9.081*** 2.112*** 1.346*** 1.236*** 3.027***
(0.979) (1.080) (1.071) (0.739) (0.326) (0.360) (0.357) (0.246)
se_acad_p1 0.231 -0.023 -0.068 -0.264 0.077 -0.008 -0.023 -0.088
(0.158) (0.197) (0.216) (0.833) (0.053) (0.066) (0.072) (0.278)
se_social_p1 1.039*** 0.515* 0.401+ 2.085+ 0.346*** 0.172* 0.134+ 0.695+
(0.175) (0.224) (0.224) (1.167) (0.058) (0.075) (0.075) (0.389)
sup_friends_p1 0.069 -0.263+ -0.273 -1.636 0.023 -0.088+ -0.091 -0.545
(0.095) (0.145) (0.169) (1.011) (0.032) (0.048) (0.056) (0.337)
sup_parents_p1 0.518*** 0.196 0.050 0.299 0.173*** 0.065 0.017 0.100
(0.107) (0.155) (0.161) (0.964) (0.036) (0.052) (0.054) (0.321)
se_acad_p2 0.415+ 0.382+ 1.475+ 0.138+ 0.127+ 0.492+
(0.216) (0.227) (0.875) (0.072) (0.076) (0.292)
se_social_p2 0.662** 0.483+ 2.093+ 0.221** 0.161+ 0.698+
(0.233) (0.246) (1.065) (0.078) (0.082) (0.355)
sup_friends_p2 0.358* 0.366* 1.647* 0.119* 0.122* 0.549*
(0.172) (0.180) (0.808) (0.057) (0.060) (0.269)
sup_parents_p2 0.375* 0.166 0.829 0.125* 0.055 0.276
(0.151) (0.162) (0.808) (0.050) (0.054) (0.269)
se_acad_p3 -0.072 -0.302 -0.024 -0.101
(0.196) (0.815) (0.065) (0.272)
se_social_p3 0.350+ 1.400+ 0.117+ 0.467+
(0.180) (0.721) (0.060) (0.240)
sup_friends_p3 0.056 0.334 0.019 0.111
(0.146) (0.878) (0.049) (0.293)
sup_parents_p3 0.452** 2.710** 0.151** 0.903**
(0.141) (0.847) (0.047) (0.282)
Num.Obs. 283 283 283 283 283 283 283 283
R2 0.332 0.389 0.420 0.420 0.332 0.389 0.420 0.420
R2 Adj. 0.323 0.371 0.394 0.394 0.323 0.371 0.394 0.394
AIC 1134.2 1117.0 1110.3 1110.3 512.4 495.2 488.5 488.5
BIC 1156.1 1153.5 1161.3 1161.3 534.2 531.7 539.5 539.5
Log.Lik. -561.090 -548.507 -541.139 -541.139 -250.183 -237.600 -230.232 -230.232
RMSE 1.76 1.68 1.64 1.64 0.59 0.56 0.55 0.55
# Flat list of all eight regressions for a single coefficient plot, with
# a dotted reference line at zero for sign comparison.
models = list(
     "model_sum_1st_factors" = mod_sum_1st,
     "model_sum_1st_2nd_factors" = mod_sum_12,
     "model_sum_score" = mod_sum,
     "model_sum_score_norm" = mod_sum_norm,
     "model_mean_1st_factors" = mod_mean_1st,
     "model_mean_1st_2nd_factors" = mod_mean_12,
     "model_mean_score"= mod_mean,
     "model_mean_score_norm" = mod_mean_norm
    )

modelplot(models, coef_omit = 'Intercept') + geom_vline(xintercept = 0, linetype="dotted", 
                color = "black") + ggtitle("Comparing Model Parameter Estimates", "Across Covariates")

Significant Coefficients

# Coefficient plots coloured by whether p < 0.001; the legend is
# suppressed on the first panel so it appears only once when combined.
# NOTE(review): values = c("grey", "black") match the two labels in
# alphabetical order ("Not significant" -> grey, "Significant" -> black)
# -- confirm this is the intended mapping.
g1 = modelplot(mod_sum, coef_omit = 'Intercept') +  aes(color = ifelse(p.value < 0.001, "Significant", "Not significant")) +
  scale_color_manual(values = c("grey", "black"),  guide = "none")


g2 = modelplot(mod_mean, coef_omit = 'Intercept') +  aes(color = ifelse(p.value < 0.001, "Significant", "Not significant")) +
  scale_color_manual(values = c("grey", "black"))

plot <- ggarrange(g1,g2, ncol=2, nrow=1);

Aggregate Driver Scores

# Parcel approach: average the three items of each driver construct into a
# single mean score, then regress the composite outcomes on the parcels.
for (stem in c('se_acad', 'se_social', 'sup_friends', 'sup_parents')) {
  df[[paste0(stem, '_mean')]] <- rowMeans(df[paste0(stem, '_p', 1:3)])
}

formula_parcel_mean = "ls_mean ~ se_acad_mean + se_social_mean + sup_friends_mean + sup_parents_mean"

formula_parcel_sum = "ls_sum ~ se_acad_mean + se_social_mean + sup_friends_mean + sup_parents_mean"

mod_sum_parcel <- lm(formula_parcel_sum, df)
mod_mean_parcel <- lm(formula_parcel_mean, df)

models_parcel <- list(
  "model_sum_score" = mod_sum_parcel,
  "model_mean_score" = mod_mean_parcel
)

modelsummary(models_parcel, stars = TRUE)
tinytable_or59mjdhizio3l7k66t7
model_sum_score model_mean_score
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 4.378*** 1.459***
(1.036) (0.345)
se_acad_mean 0.252 0.084
(0.176) (0.059)
se_social_mean 1.205*** 0.402***
(0.195) (0.065)
sup_friends_mean 0.049 0.016
(0.108) (0.036)
sup_parents_mean 0.685*** 0.228***
(0.111) (0.037)
Num.Obs. 283 283
R2 0.397 0.397
R2 Adj. 0.388 0.388
AIC 1105.3 483.5
BIC 1127.1 505.3
Log.Lik. -546.638 -235.731
RMSE 1.67 0.56
# Coefficient plots for the parcel models, coloured by p < 0.001; only
# the second panel keeps its legend.
g1 = modelplot(mod_sum_parcel, coef_omit = 'Intercept') +  aes(color = ifelse(p.value < 0.001, "Significant", "Not significant")) +
  scale_color_manual(values = c("grey", "black"), guide = "none")


g2 = modelplot(mod_mean_parcel, coef_omit = 'Intercept') +  aes(color = ifelse(p.value < 0.001, "Significant", "Not significant")) +
  scale_color_manual(values = c("grey", "black"))

plot <- ggarrange(g1,g2, ncol=2, nrow=1);

Hierarchical Models

# Hierarchical (mixed) regressions: all twelve indicator items as fixed
# effects plus a random intercept for region, estimated with REML.
formula_hierarchy_mean = "ls_mean ~ sup_parents_p1 + sup_parents_p2 + sup_parents_p3 + sup_friends_p1 + sup_friends_p2 + sup_friends_p3 + se_acad_p1 + se_acad_p2 + se_acad_p3 +
se_social_p1 + se_social_p2 + se_social_p3  + (1 | region)"

formula_hierarchy_sum = "ls_sum ~ sup_parents_p1 + sup_parents_p2 + sup_parents_p3 + sup_friends_p1 + sup_friends_p2 + sup_friends_p3 + se_acad_p1 + se_acad_p2 + se_acad_p3 +
se_social_p1 + se_social_p2 + se_social_p3 + (1 | region)"

hierarchical_mean_fit <- lmer(formula_hierarchy_mean, data = df, REML = TRUE)
hierarchical_sum_fit <- lmer(formula_hierarchy_sum, data = df, REML = TRUE)

# NOTE(review): the geom_segment warnings emitted by these plots are
# presumably rows dropped by the colour aesthetic (lmer fits do not
# report p-values for all terms) -- confirm.
g1 = modelplot(hierarchical_mean_fit, re.form=NA) +  aes(color = ifelse(p.value < 0.001, "Significant", "Not significant")) +
  scale_color_manual(values = c("grey", "black"),  guide = "none")

g2 = modelplot(hierarchical_sum_fit, re.form=NA) +  aes(color = ifelse(p.value < 0.001, "Significant", "Not significant")) +
  scale_color_manual(values = c("grey", "black"))


plot <- ggarrange(g1,g2, ncol=2, nrow=1);
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_segment()`).
Removed 2 rows containing missing values or values outside the scale range
(`geom_segment()`).

# Tabulate both hierarchical fits side by side; highlight the
# coefficient rows (2:25).
modelsummary(list("hierarchical_mean_fit"= hierarchical_mean_fit,
                  "hierarchical_sum_fit"= hierarchical_sum_fit), 
             stars = TRUE) |> 
 style_tt(
   i = 2:25,
   j = 1:1,
   background = "#17C2AD",
   color = "white",
   italic = TRUE)
tinytable_ftl5t6kstw3adyosicbz
hierarchical_mean_fit hierarchical_sum_fit
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 0.903* 2.710*
(0.385) (1.154)
sup_parents_p1 -0.007 -0.021
(0.053) (0.160)
sup_parents_p2 0.070 0.210
(0.053) (0.159)
sup_parents_p3 0.160*** 0.480***
(0.046) (0.139)
sup_friends_p1 -0.091+ -0.274+
(0.055) (0.166)
sup_friends_p2 0.125* 0.376*
(0.059) (0.177)
sup_friends_p3 0.024 0.072
(0.048) (0.144)
se_acad_p1 -0.041 -0.122
(0.071) (0.213)
se_acad_p2 0.131+ 0.393+
(0.074) (0.223)
se_acad_p3 0.039 0.116
(0.067) (0.202)
se_social_p1 0.094 0.283
(0.075) (0.224)
se_social_p2 0.191* 0.574*
(0.081) (0.244)
se_social_p3 0.130* 0.389*
(0.059) (0.178)
SD (Intercept region) 0.160 0.481
SD (Observations) 0.549 1.648
Num.Obs. 283 283
R2 Marg. 0.424 0.424
R2 Cond. 0.469 0.469
AIC 538.4 1131.6
BIC 593.1 1186.3
ICC 0.1 0.1
RMSE 0.54 1.61

Marginal Effects

# Counterfactual predictions over grids of one focal covariate, holding
# the remaining covariates at datagrid() defaults. re.form=NA restricts
# the hierarchical model to its fixed effects.
pred <- predictions(hierarchical_mean_fit, newdata = datagrid(sup_parents_p3 = 1:10, region=c('east', 'west')), re.form=NA)
pred1 <- predictions(mod_mean, newdata = datagrid(sup_parents_p2 = 1:10))

pred1 |> head() |> kableExtra::kable()
rowid estimate std.error statistic p.value s.value conf.low conf.high se_acad_p1 se_acad_p2 se_acad_p3 se_social_p1 se_social_p2 se_social_p3 sup_friends_p1 sup_friends_p2 sup_friends_p3 sup_parents_p1 sup_parents_p3 sup_parents_p2 ls_mean
1 5.232276 0.2673967 19.56747 0 280.8136 4.708188 5.756363 5.154955 5.346853 5.211248 5.290047 5.477267 5.440989 5.786219 6.010601 5.991166 5.975265 5.719081 1 6.576815
2 5.287553 0.2140733 24.69973 0 445.0319 4.867977 5.707128 5.154955 5.346853 5.211248 5.290047 5.477267 5.440989 5.786219 6.010601 5.991166 5.975265 5.719081 2 6.576815
3 5.342829 0.1610972 33.16525 0 798.8130 5.027085 5.658574 5.154955 5.346853 5.211248 5.290047 5.477267 5.440989 5.786219 6.010601 5.991166 5.975265 5.719081 3 6.576815
4 5.398106 0.1089765 49.53461 0 Inf 5.184516 5.611696 5.154955 5.346853 5.211248 5.290047 5.477267 5.440989 5.786219 6.010601 5.991166 5.975265 5.719081 4 6.576815
5 5.453383 0.0599835 90.91472 0 Inf 5.335818 5.570949 5.154955 5.346853 5.211248 5.290047 5.477267 5.440989 5.786219 6.010601 5.991166 5.975265 5.719081 5 6.576815
6 5.508660 0.0334480 164.69327 0 Inf 5.443103 5.574217 5.154955 5.346853 5.211248 5.290047 5.477267 5.440989 5.786219 6.010601 5.991166 5.975265 5.719081 6 6.576815

Regression Marginal Effects

# Predicted-outcome curves for three focal covariates of the flat
# regression model, all else held fixed; stacked in one column.
g = plot_predictions(mod_mean, condition = c("sup_parents_p3"), type = "response") + ggtitle("Counterfactual Shift of Outcome: sup_parents_p3", "Holding all else Fixed")

g1 = plot_predictions(mod_mean, condition = c("sup_friends_p1"), type = "response") + ggtitle("Counterfactual Shift of Outcome: sup_friends_p1", "Holding all else Fixed")

g2 = plot_predictions(mod_mean, condition = c("se_acad_p1"), 
                      type = "response") + ggtitle("Counterfactual Shift of Outcome: se_acad_p1", "Holding all else Fixed")

plot <- ggarrange(g,g1,g2, ncol=1, nrow=3);

Hierarchical Marginal Effects

# Same predicted-outcome curves for the hierarchical model, faceted by
# region; re.form=NA uses fixed effects only.
g = plot_predictions(hierarchical_mean_fit, condition = c("sup_parents_p3", "region"), type = "response", re.form=NA) + ggtitle("Counterfactual Shift of Outcome: sup_parents_p3", "Holding all else Fixed")

g1 = plot_predictions(hierarchical_mean_fit, condition = c("sup_friends_p1", "region"), type = "response", re.form=NA) + ggtitle("Counterfactual Shift of Outcome: sup_friends_p1", "Holding all else Fixed")

g2 = plot_predictions(hierarchical_mean_fit, condition = c("se_acad_p1", "region"), 
                      type = "response", re.form=NA) + ggtitle("Counterfactual Shift of Outcome: se_acad_p1", "Holding all else Fixed")

plot <- ggarrange(g,g1,g2, ncol=1, nrow=3);

Confirmatory Factor Analysis

# lavaan measurement model: five correlated latent factors, three
# observed indicators each (first loading of each factor fixed to 1 --
# visible in the summary output below).
model_measurement <- "
# Measurement model
SUP_Parents =~ sup_parents_p1 + sup_parents_p2 + sup_parents_p3
SUP_Friends =~ sup_friends_p1 + sup_friends_p2 + sup_friends_p3
SE_Academic =~ se_acad_p1 + se_acad_p2 + se_acad_p3
SE_Social =~ se_social_p1 + se_social_p2 + se_social_p3
LS  =~ ls_p1 + ls_p2 + ls_p3
"

# Constrained variant: the three SUP_Friends loadings are labelled and
# forced equal (a1 == a2 == a3), a 2-df restriction nested in the model
# above -- compared with anova() later in the document.
model_measurement1 <- "
# Measurement model
SUP_Parents =~ sup_parents_p1 + sup_parents_p2 + sup_parents_p3
SUP_Friends =~ a1*sup_friends_p1 + a2*sup_friends_p2 + a3*sup_friends_p3
SE_Academic =~ se_acad_p1 + se_acad_p2 + se_acad_p3
SE_Social =~ se_social_p1 + se_social_p2 + se_social_p3
LS  =~ ls_p1 + ls_p2 + ls_p3

a1 == a2 
a1 == a3
"


fit_mod <- cfa(model_measurement, data = df)
fit_mod_1<- cfa(model_measurement1, data = df)


cfa_models = list("full_measurement_model" = fit_mod, 
     "measurement_model_reduced" = fit_mod_1)
modelplot(cfa_models)

summary(fit_mod, fit.measures = TRUE, standardized = TRUE) 
lavaan 0.6-18 ended normally after 46 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                        40

  Number of observations                           283

Model Test User Model:
                                                      
  Test statistic                               207.137
  Degrees of freedom                                80
  P-value (Chi-square)                           0.000

Model Test Baseline Model:

  Test statistic                              2586.651
  Degrees of freedom                               105
  P-value                                        0.000

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.949
  Tucker-Lewis Index (TLI)                       0.933

Loglikelihood and Information Criteria:

  Loglikelihood user model (H0)              -4394.212
  Loglikelihood unrestricted model (H1)      -4290.643
                                                      
  Akaike (AIC)                                8868.423
  Bayesian (BIC)                              9014.241
  Sample-size adjusted Bayesian (SABIC)       8887.401

Root Mean Square Error of Approximation:

  RMSEA                                          0.075
  90 Percent confidence interval - lower         0.062
  90 Percent confidence interval - upper         0.088
  P-value H_0: RMSEA <= 0.050                    0.001
  P-value H_0: RMSEA >= 0.080                    0.263

Standardized Root Mean Square Residual:

  SRMR                                           0.056

Parameter Estimates:

  Standard errors                             Standard
  Information                                 Expected
  Information saturated (h1) model          Structured

Latent Variables:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  SUP_Parents =~                                                        
    sup_parents_p1    1.000                               0.938    0.876
    sup_parents_p2    1.034    0.056   18.570    0.000    0.970    0.888
    sup_parents_p3    0.988    0.059   16.637    0.000    0.927    0.811
  SUP_Friends =~                                                        
    sup_friends_p1    1.000                               1.021    0.906
    sup_friends_p2    0.793    0.043   18.466    0.000    0.809    0.857
    sup_friends_p3    0.891    0.050   17.735    0.000    0.910    0.831
  SE_Academic =~                                                        
    se_acad_p1        1.000                               0.697    0.880
    se_acad_p2        0.806    0.049   16.278    0.000    0.561    0.819
    se_acad_p3        0.952    0.058   16.491    0.000    0.663    0.828
  SE_Social =~                                                          
    se_social_p1      1.000                               0.640    0.846
    se_social_p2      0.959    0.055   17.338    0.000    0.614    0.881
    se_social_p3      0.926    0.066   13.956    0.000    0.593    0.742
  LS =~                                                                 
    ls_p1             1.000                               0.677    0.729
    ls_p2             0.820    0.078   10.488    0.000    0.555    0.762
    ls_p3             0.744    0.109    6.809    0.000    0.504    0.459

Covariances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  SUP_Parents ~~                                                        
    SUP_Friends       0.134    0.064    2.094    0.036    0.140    0.140
    SE_Academic       0.219    0.046    4.717    0.000    0.335    0.335
    SE_Social         0.284    0.046    6.239    0.000    0.473    0.473
    LS                0.360    0.056    6.455    0.000    0.567    0.567
  SUP_Friends ~~                                                        
    SE_Academic       0.071    0.047    1.493    0.135    0.100    0.100
    SE_Social         0.195    0.046    4.262    0.000    0.299    0.299
    LS                0.184    0.053    3.500    0.000    0.267    0.267
  SE_Academic ~~                                                        
    SE_Social         0.274    0.036    7.525    0.000    0.613    0.613
    LS                0.212    0.039    5.407    0.000    0.449    0.449
  SE_Social ~~                                                          
    LS                0.330    0.043    7.650    0.000    0.761    0.761

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .sup_parents_p1    0.268    0.037    7.143    0.000    0.268    0.233
   .sup_parents_p2    0.253    0.038    6.595    0.000    0.253    0.212
   .sup_parents_p3    0.446    0.048    9.262    0.000    0.446    0.342
   .sup_friends_p1    0.228    0.040    5.663    0.000    0.228    0.179
   .sup_friends_p2    0.237    0.030    7.926    0.000    0.237    0.266
   .sup_friends_p3    0.371    0.042    8.811    0.000    0.371    0.310
   .se_acad_p1        0.141    0.022    6.487    0.000    0.141    0.226
   .se_acad_p2        0.154    0.018    8.648    0.000    0.154    0.329
   .se_acad_p3        0.202    0.024    8.397    0.000    0.202    0.315
   .se_social_p1      0.163    0.020    8.095    0.000    0.163    0.284
   .se_social_p2      0.109    0.016    6.782    0.000    0.109    0.224
   .se_social_p3      0.287    0.028   10.141    0.000    0.287    0.450
   .ls_p1             0.404    0.048    8.422    0.000    0.404    0.469
   .ls_p2             0.223    0.029    7.624    0.000    0.223    0.420
   .ls_p3             0.951    0.085   11.123    0.000    0.951    0.789
    SUP_Parents       0.880    0.098    8.937    0.000    1.000    1.000
    SUP_Friends       1.042    0.111    9.405    0.000    1.000    1.000
    SE_Academic       0.485    0.054    8.908    0.000    1.000    1.000
    SE_Social         0.410    0.048    8.459    0.000    1.000    1.000
    LS                0.458    0.072    6.323    0.000    1.000    1.000
# Compare the observed indicator covariances with the model-implied
# covariance matrix of the fitted CFA, stacked vertically.
g1 <- plot_heatmap(cov(df[, drivers]))

g2 <- plot_heatmap(
  data.frame(fitted(fit_mod)$cov),
  title = "Model Implied Covariances",
  subtitle = "Fitted Values"
)

plot <- ggarrange(g1, g2, ncol = 1, nrow = 2);

Modification Indices

# Six largest modification indices (mi > 5): suggested extra parameters
# that would most reduce model misfit.
modindices(fit_mod, sort = TRUE, maximum.number = 6, standardized = FALSE, minimum.value = 5)
               lhs op            rhs     mi    epc
152 sup_friends_p1 ~~   se_social_p3 19.245  0.095
68     SUP_Friends =~          ls_p2 17.491  0.181
64     SUP_Friends =~   se_social_p1 17.210 -0.136
60     SUP_Friends =~ sup_parents_p3 16.063 -0.187
210          ls_p2 ~~          ls_p3 13.931 -0.140
57     SUP_Parents =~          ls_p3 13.057  0.329

Summary Global Fit Measures

# Side-by-side global fit measures for the full and reduced CFA models.
# The measure vector is defined once so the two columns cannot drift out
# of sync (it was previously duplicated in both fitMeasures() calls).
fit_stats <- c("chisq", "baseline.chisq", "cfi", "aic", "bic", "rmsea", "srmr")
summary_df <- cbind(
  fitMeasures(fit_mod, fit_stats),
  fitMeasures(fit_mod_1, fit_stats)
)
colnames(summary_df) <- c('Full Model', 'Reduced Model')

summary_df
                  Full Model Reduced Model
chisq           207.13746909  225.18095274
baseline.chisq 2586.65112811 2586.65112811
cfi               0.94876900    0.94230416
aic            8868.42313715 8882.46662079
bic            9014.24101305 9020.99360290
rmsea             0.07493739    0.07854933
srmr              0.05595908    0.05904842
# Path diagrams: unstandardized estimates for the full model and
# standardized estimates for the constrained model.
semPlot::semPaths(fit_mod, whatLabels = 'est', intercepts = FALSE)

semPlot::semPaths(fit_mod_1, whatLabels = 'std', intercepts = FALSE)

Comparing the empirical and implied variance-covariance matrix

# Heatmaps of the standardized residual covariances for both CFA models.
# Reuses plot_heatmap() (defined earlier in this file) instead of
# duplicating ~20 lines of identical ggplot code per matrix; the helper
# performs the same melt/round/tile pipeline with the same colour scale.
g1 <- plot_heatmap(
  data.frame(resid(fit_mod, type = "standardized")$cov),
  title = "Residuals of the Sample Covariances and Model Implied Covariances",
  subtitle = "A Visual Check of Model fit: Full Model"
)

g2 <- plot_heatmap(
  data.frame(resid(fit_mod_1, type = "standardized")$cov),
  title = "Residuals of the Sample Covariances and Model Implied Covariances",
  subtitle = "A Visual Check of Model fit: Reduced Model"
)

plot <- ggarrange(g1, g2, ncol = 1, nrow = 2);

# Chi-square test of the full CFA against the saturated model.
anova(fit_mod)
Chi-Squared Test Statistic (unscaled)

          Df    AIC    BIC  Chisq Chisq diff Df diff         Pr(>Chisq)    
Saturated  0                 0.00                                          
Model     80 8868.4 9014.2 207.14     207.14      80 0.0000000000003209 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Chi-square test of the constrained CFA against the saturated model.
anova(fit_mod_1)
Chi-Squared Test Statistic (unscaled)

          Df    AIC  BIC  Chisq Chisq diff Df diff           Pr(>Chisq)    
Saturated  0               0.00                                            
Model     82 8882.5 9021 225.18     225.18      82 0.000000000000002776 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Likelihood-ratio (chi-square difference) test of the nested models.
anova(fit_mod, fit_mod_1)

Chi-Squared Difference Test

          Df    AIC    BIC  Chisq Chisq diff   RMSEA Df diff Pr(>Chisq)    
fit_mod   80 8868.4 9014.2 207.14                                          
fit_mod_1 82 8882.5 9021.0 225.18     18.044 0.16836       2  0.0001208 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Structural Equation Models

# Full SEM: the measurement model plus a structural regression of life
# satisfaction on the four driver factors, and a residual covariance
# between the two self-efficacy factors.
model <- "
# Measurement model
SUP_Parents =~ sup_parents_p1 + sup_parents_p2 + sup_parents_p3
SUP_Friends =~ sup_friends_p1 + sup_friends_p2 + sup_friends_p3
SE_Academic =~ se_acad_p1 + se_acad_p2 + se_acad_p3
SE_Social =~ se_social_p1 + se_social_p2 + se_social_p3
LS  =~ ls_p1 + ls_p2 + ls_p3

# Structural model 
# Regressions
LS ~ SE_Academic + SE_Social + SUP_Parents + SUP_Friends

# Residual covariances
SE_Academic ~~ SE_Social
"

fit_mod_sem <- sem(model, data = df)
modelplot(fit_mod_sem)

# Path diagram with standardized estimates.
semPlot::semPaths(fit_mod_sem, whatLabels = 'std', intercepts = FALSE)

# Heatmap of the SEM's standardized residual covariances. Reuses
# plot_heatmap() (defined earlier in this file) rather than repeating the
# same melt/round/ggplot pipeline inline; calling it at top level prints
# the returned ggplot, as the original expression did.
plot_heatmap(
  data.frame(resid(fit_mod_sem, type = "standardized")$cov),
  title = "Residuals of the Sample Covariances and Model Implied Covariances",
  subtitle = "A Visual Check of Model fit"
)

Confirmatory Factor Models with PyMC

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pytensor import tensor as pt
import arviz as az
import networkx as nx



# Read the whitespace-delimited data file of five observed indicators.
# Use a raw string for the regex separator: '\s' inside a plain string
# literal is an invalid escape sequence (SyntaxWarning on Python >= 3.12).
df_p = pd.read_csv('IIS.dat', sep=r'\s+')
df_p.head() 
     PI    AD   IGC   FI   FC
0  4.00  3.38  4.67  2.6  3.2
1  2.57  3.00  3.50  2.4  2.8
2  2.29  3.29  4.83  2.0  3.4
3  2.43  3.63  4.33  3.6  3.8
4  3.00  4.00  4.83  3.4  3.8
# Coordinate labels for the PyMC model. Each latent factor loads on its
# own indicator group: 'Student' on PI/AD/IGC, 'Faculty' on FI/FC (see the
# measurement equations in the model below).
obs_idx = list(range(len(df_p)))
coords = {
    'obs': obs_idx,
    'indicators': ['PI', 'AD', 'IGC', 'FI', 'FC'],
    'indicators_1': ['PI', 'AD', 'IGC'],  # first-factor indicators
    'indicators_2': ['FI', 'FC'],         # second-factor indicators
    'latent': ['Student', 'Faculty'],
}
# Two-factor CFA in PyMC: factor 0 loads on PI/AD/IGC, factor 1 on FI/FC.
with pm.Model(coords=coords) as model:
  
  # Indicator-level error scales (used as sigma in the Normal likelihood below).
  Psi = pm.InverseGamma('Psi', 5, 10, dims='indicators')
  # Factor loadings. `lambdas_` is a throwaway local name -- each prior is
  # registered under its own string name. set_subtensor pins the first
  # loading of each factor to 1, the usual scale-setting (identification)
  # constraint.
  lambdas_ = pm.Normal('lambdas_1', 1, 10, dims=('indicators_1'))
  lambdas_1 = pm.Deterministic('lambdas1', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_1'))
  lambdas_ = pm.Normal('lambdas_2', 1, 10, dims=('indicators_2'))
  lambdas_2 = pm.Deterministic('lambdas2', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_2'))
  # Per-indicator intercepts.
  tau = pm.Normal('tau', 3, 10, dims='indicators')
  kappa = 0  # latent factor means fixed at zero
  # Correlated latent factor scores: LKJ prior on the 2x2 factor covariance.
  sd_dist = pm.Exponential.dist(1.0, shape=2)
  chol, _, _ = pm.LKJCholeskyCov('chol_cov', n=2, eta=2,
    sd_dist=sd_dist, compute_corr=True)
  ksi = pm.MvNormal('ksi', kappa, chol=chol, dims=('obs', 'latent'))

  # Measurement model: expected indicator = intercept + factor score * loading.
  m1 = tau[0] + ksi[obs_idx, 0]*lambdas_1[0]
  m2 = tau[1] + ksi[obs_idx, 0]*lambdas_1[1]
  m3 = tau[2] + ksi[obs_idx, 0]*lambdas_1[2]
  m4 = tau[3] + ksi[obs_idx, 1]*lambdas_2[0]
  m5 = tau[4] + ksi[obs_idx, 1]*lambdas_2[1]
  
  # Stack the five means into (obs, indicators) to match df_p's layout.
  mu = pm.Deterministic('mu', pm.math.stack([m1, m2, m3, m4, m5]).T)
  _  = pm.Normal('likelihood', mu, Psi, observed=df_p.values)

  # Sample with the numpyro NUTS backend; keep pointwise log-likelihood for
  # model comparison, then add posterior-predictive draws.
  idata = pm.sample(nuts_sampler='numpyro', target_accept=.95, 
                    idata_kwargs={"log_likelihood": True})
  idata.extend(pm.sample_posterior_predictive(idata))
  
# Posterior summary; `ksi` is restricted to two example observations (0, 7).
summary_df = az.summary(idata, var_names=['lambdas1', 'lambdas2', 'tau', 'Psi', 'ksi', 'chol_cov_corr'], coords= {'obs': [0, 7]})

PyMC Confirmatory Factor Model

py$summary_df |> kable() |>  kable_classic(full_width = F, html_font = "Cambria")
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
lambdas1[PI] 1.000 0.000 1.000 1.000 0.000 0.000 4000 4000 NaN
lambdas1[AD] 0.901 0.064 0.777 1.013 0.004 0.003 290 488 1.01
lambdas1[IGC] 0.537 0.045 0.458 0.626 0.002 0.002 402 781 1.01
lambdas2[FI] 1.000 0.000 1.000 1.000 0.000 0.000 4000 4000 NaN
lambdas2[FC] 0.979 0.057 0.873 1.083 0.002 0.002 570 1146 1.01
tau[PI] 3.333 0.038 3.258 3.400 0.002 0.001 551 1419 1.00
tau[AD] 3.897 0.028 3.846 3.948 0.001 0.001 372 917 1.01
tau[IGC] 4.595 0.022 4.553 4.634 0.001 0.001 599 1276 1.01
tau[FI] 3.033 0.040 2.957 3.107 0.002 0.001 471 1318 1.00
tau[FC] 3.712 0.036 3.648 3.779 0.002 0.001 392 827 1.01
Psi[PI] 0.609 0.024 0.564 0.656 0.001 0.001 936 1461 1.00
Psi[AD] 0.317 0.020 0.280 0.355 0.001 0.001 704 1225 1.00
Psi[IGC] 0.355 0.013 0.329 0.378 0.000 0.000 2342 2727 1.00
Psi[FI] 0.570 0.025 0.522 0.616 0.001 0.001 1131 1828 1.00
Psi[FC] 0.421 0.025 0.372 0.465 0.001 0.001 793 1740 1.00
ksi[0, Student] -0.223 0.228 -0.637 0.226 0.004 0.003 3564 2839 1.00
ksi[0, Faculty] -0.366 0.277 -0.876 0.165 0.005 0.004 3321 2548 1.00
ksi[7, Student] 0.887 0.224 0.481 1.318 0.004 0.003 2977 2990 1.00
ksi[7, Faculty] 0.872 0.271 0.351 1.357 0.004 0.003 3841 3006 1.00
chol_cov_corr[0, 0] 1.000 0.000 1.000 1.000 0.000 0.000 4000 4000 NaN
chol_cov_corr[0, 1] 0.850 0.028 0.799 0.902 0.001 0.001 452 814 1.01
chol_cov_corr[1, 0] 0.850 0.028 0.799 0.902 0.001 0.001 452 814 1.01
chol_cov_corr[1, 1] 1.000 0.000 1.000 1.000 0.000 0.000 4146 4000 1.00
az.plot_trace(idata, var_names=['lambdas1', 'lambdas2', 'tau', 'Psi', 'ksi']);
/Users/nathanielforde/mambaforge/envs/pymc_causal/lib/python3.11/site-packages/arviz/stats/density_utils.py:487: UserWarning: Your data appears to have a single value or no finite values
  warnings.warn("Your data appears to have a single value or no finite values")
/Users/nathanielforde/mambaforge/envs/pymc_causal/lib/python3.11/site-packages/arviz/stats/density_utils.py:487: UserWarning: Your data appears to have a single value or no finite values
  warnings.warn("Your data appears to have a single value or no finite values")

# Load the life-satisfaction survey data and name the 15 observed
# indicators: three parcels (p1-p3) for each of the five constructs.
df = pd.read_csv('sem_data.csv')

_constructs = ['se_acad', 'se_social', 'sup_friends', 'sup_parents', 'ls']
drivers = [f'{c}_p{i}' for c in _constructs for i in (1, 2, 3)]

# Coordinate labels for the PyMC model: one indicator group per latent
# factor; 'latent1' duplicates 'latent' so the covariance matrix can carry
# two distinct named dimensions.
_latent_names = ['SE_ACAD', 'SE_SOCIAL', 'SUP_F', 'SUP_P', 'LS']
obs_idx = list(range(len(df)))
coords = {
    'obs': obs_idx,
    'indicators': drivers,
    'indicators_1': drivers[0:3],    # se_acad parcels
    'indicators_2': drivers[3:6],    # se_social parcels
    'indicators_3': drivers[6:9],    # sup_friends parcels
    'indicators_4': drivers[9:12],   # sup_parents parcels
    'indicators_5': drivers[12:15],  # ls parcels
    'latent': _latent_names,
    'latent1': _latent_names,
}
# Five-factor CFA for the life-satisfaction survey: one latent factor per
# construct, three indicator parcels each.
with pm.Model(coords=coords) as model:
  
  # Indicator-level error scales (used as sigma in the Normal likelihood below).
  Psi = pm.InverseGamma('Psi', 5, 10, dims='indicators')
  # Loadings for each factor. `lambdas_` is a throwaway local name -- each
  # prior is registered under its own string name. set_subtensor pins the
  # first loading per factor to 1, the usual scale-setting constraint.
  lambdas_ = pm.Normal('lambdas_1', 1, 10, dims=('indicators_1'))
  lambdas_1 = pm.Deterministic('lambdas1', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_1'))
  lambdas_ = pm.Normal('lambdas_2', 1, 10, dims=('indicators_2'))
  lambdas_2 = pm.Deterministic('lambdas2', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_2'))
  lambdas_ = pm.Normal('lambdas_3', 1, 10, dims=('indicators_3'))
  lambdas_3 = pm.Deterministic('lambdas3', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_3'))
  lambdas_ = pm.Normal('lambdas_4', 1, 10, dims=('indicators_4'))
  lambdas_4 = pm.Deterministic('lambdas4', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_4'))
  lambdas_ = pm.Normal('lambdas_5', 1, 10, dims=('indicators_5'))
  lambdas_5 = pm.Deterministic('lambdas5', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_5'))
  # Per-indicator intercepts.
  tau = pm.Normal('tau', 3, 10, dims='indicators')
  kappa = 0  # latent factor means fixed at zero
  # Correlated factor scores: LKJ prior over the 5x5 latent covariance.
  sd_dist = pm.Exponential.dist(1.0, shape=5)
  chol, _, _ = pm.LKJCholeskyCov('chol_cov', n=5, eta=2,
    sd_dist=sd_dist, compute_corr=True)
  cov = pm.Deterministic("cov", chol.dot(chol.T), dims=('latent', 'latent1'))
  ksi = pm.MvNormal('ksi', kappa, chol=chol, dims=('obs', 'latent'))

  # Measurement model: indicator mean = intercept + factor score * loading.
  # Each latent factor (column of ksi) drives its own three parcels.
  m0 = tau[0] + ksi[obs_idx, 0]*lambdas_1[0]
  m1 = tau[1] + ksi[obs_idx, 0]*lambdas_1[1]
  m2 = tau[2] + ksi[obs_idx, 0]*lambdas_1[2]
  m3 = tau[3] + ksi[obs_idx, 1]*lambdas_2[0]
  m4 = tau[4] + ksi[obs_idx, 1]*lambdas_2[1]
  m5 = tau[5] + ksi[obs_idx, 1]*lambdas_2[2]
  m6 = tau[6] + ksi[obs_idx, 2]*lambdas_3[0]
  m7 = tau[7] + ksi[obs_idx, 2]*lambdas_3[1]
  m8 = tau[8] + ksi[obs_idx, 2]*lambdas_3[2]
  m9 = tau[9] + ksi[obs_idx, 3]*lambdas_4[0]
  m10 = tau[10] + ksi[obs_idx, 3]*lambdas_4[1]
  m11 = tau[11] + ksi[obs_idx, 3]*lambdas_4[2]
  m12 = tau[12] + ksi[obs_idx, 4]*lambdas_5[0]
  m13 = tau[13] + ksi[obs_idx, 4]*lambdas_5[1]
  m14 = tau[14] + ksi[obs_idx, 4]*lambdas_5[2]
  
  # Stack the 15 means into (obs, indicators) to match df[drivers].
  mu = pm.Deterministic('mu', pm.math.stack([m0, m1, m2, m3, m4, m5, m6, m7,
                                             m8, m9, m10, m11, m12, m13, m14]).T)
  _  = pm.Normal('likelihood', mu, Psi, observed=df[drivers].values)

  # numpyro NUTS; keep pointwise log-likelihood for model comparison, then
  # add posterior-predictive draws.
  idata = pm.sample(nuts_sampler='numpyro', target_accept=.95, 
                    idata_kwargs={"log_likelihood": True})
  idata.extend(pm.sample_posterior_predictive(idata))
  
# Summary of the measurement parameters (loadings, intercepts, error scales).
summary_df1 = az.summary(idata, var_names=['lambdas1', 'lambdas2', 'lambdas3', 'lambdas4', 'lambdas5', 'tau', 'Psi'])

# Posterior-mean covariance and correlation matrices among the latent
# factors, labelled with the construct names for display.
factor_names = ['SE_ACAD', 'SE_SOCIAL', 'SUP_F', 'SUP_P', 'LS']

cov_df = pd.DataFrame(az.extract(idata['posterior'])['cov'].mean(axis=2))
cov_df.index = factor_names
cov_df.columns = factor_names

correlation_df = pd.DataFrame(az.extract(idata['posterior'])['chol_cov_corr'].mean(axis=2))
correlation_df.index = factor_names
correlation_df.columns = factor_names

Life Satisfaction Model

py$summary_df1 |> kable() |>  kable_classic(full_width = F, html_font = "Cambria")
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
lambdas1[se_acad_p1] 1.000 0.000 1.000 1.000 0.000 0.000 4000 4000 NaN
lambdas1[se_acad_p2] 0.818 0.052 0.719 0.912 0.002 0.001 914 1405 1.00
lambdas1[se_acad_p3] 0.969 0.062 0.856 1.084 0.002 0.001 975 1893 1.00
lambdas2[se_social_p1] 1.000 0.000 1.000 1.000 0.000 0.000 4000 4000 NaN
lambdas2[se_social_p2] 0.962 0.060 0.850 1.069 0.002 0.002 776 1556 1.00
lambdas2[se_social_p3] 0.939 0.075 0.799 1.081 0.002 0.002 1030 1663 1.00
lambdas3[sup_friends_p1] 1.000 0.000 1.000 1.000 0.000 0.000 4000 4000 NaN
lambdas3[sup_friends_p2] 0.800 0.044 0.717 0.880 0.001 0.001 910 1568 1.00
lambdas3[sup_friends_p3] 0.903 0.053 0.804 1.005 0.002 0.001 897 1076 1.00
lambdas4[sup_parents_p1] 1.000 0.000 1.000 1.000 0.000 0.000 4000 4000 NaN
lambdas4[sup_parents_p2] 1.042 0.056 0.939 1.146 0.002 0.001 797 2227 1.00
lambdas4[sup_parents_p3] 1.010 0.064 0.891 1.128 0.002 0.001 1026 1495 1.00
lambdas5[ls_p1] 1.000 0.000 1.000 1.000 0.000 0.000 4000 4000 NaN
lambdas5[ls_p2] 0.790 0.080 0.646 0.948 0.003 0.002 501 1480 1.01
lambdas5[ls_p3] 0.989 0.100 0.810 1.179 0.004 0.003 522 901 1.01
tau[se_acad_p1] 5.158 0.047 5.073 5.247 0.003 0.002 350 880 1.01
tau[se_acad_p2] 5.349 0.041 5.273 5.425 0.002 0.001 405 1014 1.01
tau[se_acad_p3] 5.213 0.048 5.125 5.304 0.002 0.002 418 967 1.00
tau[se_social_p1] 5.293 0.044 5.212 5.378 0.003 0.002 289 770 1.01
tau[se_social_p2] 5.481 0.042 5.406 5.557 0.003 0.002 253 855 1.02
tau[se_social_p3] 5.444 0.048 5.356 5.538 0.002 0.002 369 1082 1.01
tau[sup_friends_p1] 5.791 0.069 5.663 5.920 0.004 0.003 369 870 1.01
tau[sup_friends_p2] 6.014 0.058 5.907 6.123 0.003 0.002 410 1003 1.01
tau[sup_friends_p3] 5.994 0.067 5.868 6.123 0.003 0.002 430 1111 1.01
tau[sup_parents_p1] 5.978 0.066 5.862 6.105 0.004 0.003 275 777 1.01
tau[sup_parents_p2] 5.931 0.067 5.809 6.057 0.004 0.003 273 827 1.01
tau[sup_parents_p3] 5.723 0.070 5.593 5.853 0.004 0.003 318 927 1.01
tau[ls_p1] 5.196 0.056 5.085 5.296 0.003 0.002 373 1146 1.01
tau[ls_p2] 5.780 0.043 5.702 5.863 0.002 0.002 354 1015 1.01
tau[ls_p3] 5.227 0.054 5.130 5.332 0.003 0.002 346 914 1.01
Psi[se_acad_p1] 0.411 0.028 0.361 0.465 0.001 0.001 1279 1972 1.01
Psi[se_acad_p2] 0.414 0.024 0.369 0.458 0.001 0.000 2207 2824 1.00
Psi[se_acad_p3] 0.467 0.028 0.417 0.519 0.001 0.000 1821 2478 1.00
Psi[se_social_p1] 0.430 0.026 0.381 0.479 0.001 0.000 1439 1938 1.00
Psi[se_social_p2] 0.362 0.024 0.317 0.409 0.001 0.000 1628 1917 1.00
Psi[se_social_p3] 0.553 0.028 0.501 0.605 0.001 0.000 2789 2722 1.00
Psi[sup_friends_p1] 0.514 0.038 0.444 0.587 0.001 0.001 942 1765 1.00
Psi[sup_friends_p2] 0.509 0.032 0.448 0.566 0.001 0.001 1660 2044 1.00
Psi[sup_friends_p3] 0.625 0.036 0.557 0.694 0.001 0.001 2178 2691 1.00
Psi[sup_parents_p1] 0.551 0.034 0.486 0.614 0.001 0.001 1452 2292 1.00
Psi[sup_parents_p2] 0.533 0.037 0.468 0.606 0.001 0.001 1297 1841 1.00
Psi[sup_parents_p3] 0.676 0.038 0.601 0.747 0.001 0.001 1900 2372 1.00
Psi[ls_p1] 0.671 0.037 0.604 0.743 0.001 0.001 1342 2088 1.00
Psi[ls_p2] 0.534 0.030 0.478 0.591 0.001 0.001 1638 2169 1.00
Psi[ls_p3] 0.621 0.035 0.557 0.689 0.001 0.001 1664 2422 1.00

# Forest plot of the loadings for all five factors, chains combined.
fig, ax = plt.subplots(figsize=(15, 8))
az.plot_forest(
    idata,
    var_names=['lambdas1', 'lambdas2', 'lambdas3', 'lambdas4', 'lambdas5'],
    combined=True,
    ax=ax,
);
ax.set_title("Factor Loadings for each of the Five Factors");

py$cov_df |> kable(caption= "Covariances Amongst Latent Factors",digits=2) |> kable_styling() %>% kable_classic(full_width = F, html_font = "Cambria") |> row_spec(5, color = "red")
Covariances Amongst Latent Factors
SE_ACAD SE_SOCIAL SUP_F SUP_P LS
SE_ACAD 0.47 0.26 0.06 0.20 0.22
SE_SOCIAL 0.26 0.40 0.19 0.27 0.30
SUP_F 0.06 0.19 1.04 0.12 0.16
SUP_P 0.20 0.27 0.12 0.86 0.39
LS 0.22 0.30 0.16 0.39 0.42
py$correlation_df |> kable( caption= "Correlations Amongst Latent Factors", digits=2) |> kable_styling() |>  kable_classic(full_width = F, html_font = "Cambria") |> row_spec(5, color = "red")
Correlations Amongst Latent Factors
SE_ACAD SE_SOCIAL SUP_F SUP_P LS
SE_ACAD 1.00 0.60 0.09 0.32 0.50
SE_SOCIAL 0.60 1.00 0.29 0.46 0.75
SUP_F 0.09 0.29 1.00 0.13 0.25
SUP_P 0.32 0.46 0.13 1.00 0.64
LS 0.50 0.75 0.25 0.64 1.00

# Posterior predictive checks: for each of the 15 indicators, overlay the
# mean predicted score distribution on the observed histogram.
fig, axs = plt.subplots(5, 3, figsize=(20, 20))
axs = axs.flatten()
for idx, axis in enumerate(axs):
    ppc_mean = idata['posterior_predictive'].sel({'likelihood_dim_3': idx}).mean(dim=('chain', 'draw'))
    axis.hist(df[drivers[idx]], alpha=0.3, ec='black', bins=20, label='Observed Scores')
    axis.hist(ppc_mean['likelihood'], color='purple', alpha=0.6, bins=20, label='Predicted Scores')
    axis.set_title(f"Posterior Predictive Checks {drivers[idx]}")
    axis.legend();

# Model-implied covariance versus the sample covariance of the observed
# indicators. NOTE(review): the [:, :, 0] slice appears to take a single
# posterior-predictive draw (index 0 on the stacked sample axis) rather
# than averaging over draws -- confirm this is intended.
model_cov = pd.DataFrame(az.extract(idata['posterior_predictive'])['likelihood'][:, :, 0]).cov()
obs_cov = df[drivers].cov()
model_cov.index = obs_cov.index
model_cov.columns = obs_cov.columns
residuals = model_cov - obs_cov
# This last line is R (reticulate): plot_heatmap() is defined elsewhere in
# the document and reads the Python `residuals` object via py$.
plot_heatmap(py$residuals, "Residuals of the Sample Covariances and Model Implied Covariances", "A Visual Check of Bayesian Model fit")

Structural Equation Modelling in PyMC

def make_sem(priors): 
  """Build and sample the Bayesian SEM for the life-satisfaction data.

  The measurement model matches the five-factor CFA above, except that
  only the four exogenous constructs get free latent factor scores; the
  three life-satisfaction (LS) indicators are instead driven by a linear
  regression on those four factor scores. Relies on the module-level
  `df` and `drivers` objects.

  Parameters
  ----------
  priors : dict
      'lambda' : [mu, sigma] for the Normal priors on the factor loadings.
      'eta'    : eta parameter of the LKJ prior on the latent correlations.

  Returns
  -------
  tuple
      (model, idata): the PyMC model and the sampled InferenceData with
      posterior, pointwise log-likelihood and posterior-predictive groups.
  """

  # 'latent' deliberately excludes LS -- it is modelled structurally below.
  coords = {'obs': list(range(len(df))), 
            'indicators': drivers,
            'indicators_1': ['se_acad_p1','se_acad_p2','se_acad_p3'],
            'indicators_2': ['se_social_p1','se_social_p2','se_social_p3'],
            'indicators_3': ['sup_friends_p1','sup_friends_p2','sup_friends_p3'],
            'indicators_4': [ 'sup_parents_p1','sup_parents_p2','sup_parents_p3'],
            'indicators_5': ['ls_p1','ls_p2', 'ls_p3'],
            'latent': ['SE_ACAD', 'SE_SOCIAL', 'SUP_F', 'SUP_P'], 
            'latent1': ['SE_ACAD', 'SE_SOCIAL', 'SUP_F', 'SUP_P']
            }

  obs_idx = list(range(len(df)))
  with pm.Model(coords=coords) as model:
    
    # Indicator-level error scales (sigma of the Normal likelihood below).
    Psi = pm.InverseGamma('Psi', 5, 10, dims='indicators')
    # Loadings; first loading per factor is fixed to 1 via set_subtensor,
    # the usual scale-setting (identification) constraint.
    lambdas_ = pm.Normal('lambdas_1',  priors['lambda'][0], priors['lambda'][1], dims=('indicators_1'))
    lambdas_1 = pm.Deterministic('lambdas1', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_1'))
    lambdas_ = pm.Normal('lambdas_2', priors['lambda'][0], priors['lambda'][1], dims=('indicators_2'))
    lambdas_2 = pm.Deterministic('lambdas2', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_2'))
    lambdas_ = pm.Normal('lambdas_3', priors['lambda'][0], priors['lambda'][1], dims=('indicators_3'))
    lambdas_3 = pm.Deterministic('lambdas3', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_3'))
    lambdas_ = pm.Normal('lambdas_4', priors['lambda'][0], priors['lambda'][1], dims=('indicators_4'))
    lambdas_4 = pm.Deterministic('lambdas4', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_4'))
    lambdas_ = pm.Normal('lambdas_5', priors['lambda'][0], priors['lambda'][1], dims=('indicators_5'))
    lambdas_5 = pm.Deterministic('lambdas5', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_5'))
    # Per-indicator intercepts.
    tau = pm.Normal('tau', 3, 10, dims='indicators')
    kappa = 0  # latent factor means fixed at zero
    # Correlated factor scores for the four exogenous constructs.
    sd_dist = pm.Exponential.dist(1.0, shape=4)
    chol, _, _ = pm.LKJCholeskyCov('chol_cov', n=4, eta=priors['eta'],
      sd_dist=sd_dist, compute_corr=True)
    cov = pm.Deterministic("cov", chol.dot(chol.T), dims=('latent', 'latent1'))
    ksi = pm.MvNormal('ksi', kappa, chol=chol, dims=('obs', 'latent'))

    # Structural part: a linear combination of the four factor scores.
    # NOTE(review): there is no disturbance term here, so the implied LS
    # factor is deterministic given the exogenous factors -- confirm this
    # is the intended structural specification.
    beta_r = pm.Normal('beta_r', 0, .5, dims='latent')
    regression = pm.Deterministic('regr', beta_r[0]*ksi[obs_idx, 0] + beta_r[1]*ksi[obs_idx, 1] +
                                   beta_r[2]*ksi[obs_idx, 2] + beta_r[3]*ksi[obs_idx, 3])

    # Measurement model for the four exogenous constructs.
    m0 = tau[0] + ksi[obs_idx, 0]*lambdas_1[0]
    m1 = tau[1] + ksi[obs_idx, 0]*lambdas_1[1]
    m2 = tau[2] + ksi[obs_idx, 0]*lambdas_1[2]
    m3 = tau[3] + ksi[obs_idx, 1]*lambdas_2[0]
    m4 = tau[4] + ksi[obs_idx, 1]*lambdas_2[1]
    m5 = tau[5] + ksi[obs_idx, 1]*lambdas_2[2]
    m6 = tau[6] + ksi[obs_idx, 2]*lambdas_3[0]
    m7 = tau[7] + ksi[obs_idx, 2]*lambdas_3[1]
    m8 = tau[8] + ksi[obs_idx, 2]*lambdas_3[2]
    m9 = tau[9] + ksi[obs_idx, 3]*lambdas_4[0]
    m10 = tau[10] + ksi[obs_idx, 3]*lambdas_4[1]
    m11 = tau[11] + ksi[obs_idx, 3]*lambdas_4[2]
    # LS indicators: driven by the structural regression instead of a free
    # factor score.
    m12 = tau[12] + regression*lambdas_5[0]
    m13 = tau[13] + regression*lambdas_5[1]
    m14 = tau[14] + regression*lambdas_5[2]
    
    # Stack the 15 means into (obs, indicators) to match df[drivers].
    mu = pm.Deterministic('mu', pm.math.stack([m0, m1, m2, m3, m4, m5, m6, m7,
                                              m8, m9, m10, m11, m12, m13, m14]).T)
    _  = pm.Normal('likelihood', mu, Psi, observed=df[drivers].values)

    # Six chains via numpyro NUTS; pointwise log-likelihood retained for
    # model comparison; posterior-predictive draws added afterwards.
    idata = pm.sample(chains=6, nuts_sampler='numpyro', target_accept=.95, tune=1000,
                      idata_kwargs={"log_likelihood": True})
    idata.extend(pm.sample_posterior_predictive(idata))

    return model, idata


# Fit the SEM with weakly-informative defaults (eta=2 LKJ prior, N(1, 10)
# loading priors) and plot the posterior structural coefficients.
model1, idata1 = make_sem(priors={'eta': 2, 'lambda': [1, 10]})
az.plot_posterior(idata1, var_names=['beta_r'])
array([<Axes: title={'center': 'beta_r\nSE_ACAD'}>,
       <Axes: title={'center': 'beta_r\nSE_SOCIAL'}>,
       <Axes: title={'center': 'beta_r\nSUP_F'}>,
       <Axes: title={'center': 'beta_r\nSUP_P'}>], dtype=object)

Citation

BibTeX citation:
@online{forde,
  author = {Nathaniel Forde},
  title = {Measurement, {Latent} {Factors} and {Theory} {Construction}},
  langid = {en}
}
For attribution, please cite this work as:
Nathaniel Forde. n.d. “Measurement, Latent Factors and Theory Construction.”